package crawler.script;

import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.alibaba.fastjson.JSONObject;

import crawler.client.sdk.task.java.CrawlerExtractInfo;
import crawler.client.sdk.task.java.CrawlerTaskInfo;
import crawler.utility.http.HttpClientResponse;
import crawler.utility.http.HttpClientUtil;

/**
 * Entry URL: https://www.jd.com/allSort.aspx
 */
public class Step1 {
	
	private static final Logger log = LoggerFactory.getLogger(Step1.class);
	
	/** Name of the task script assigned to every task emitted by this step. */
	private static final String NEXT_TASK_NAME = "crawler.script.Step2";
	
	/** Group name assigned to every task emitted by this step. */
	private static final String GROUP_NAME = "1";
	
	/** CSS selector for the anchors that are turned into follow-up tasks. */
	private static final String LINK_SELECTOR = "div.category-item > div.mc > div.items > dl > dd > a";
	
	/** Scheme prepended to hrefs that do not carry one themselves (e.g. {@code //host/path}). */
	private static final String DEFAULT_SCHEME = "https:";
	
	/**
	 * Downloads the page at the task's URL, selects the anchors matching
	 * {@link #LINK_SELECTOR} and creates one follow-up task per anchor.
	 *
	 * <p>Each follow-up task carries the anchor's absolute URL as its child URL,
	 * a JSON information string with the keys {@code nameStep1} (anchor text) and
	 * {@code urlStep1} (absolute URL), the task name {@link #NEXT_TASK_NAME} and
	 * the group name {@link #GROUP_NAME}. Anchors without an {@code href} value
	 * are skipped.
	 *
	 * @param crawlerTaskInfo task whose task URL is fetched
	 * @return extraction result wrapping the list of follow-up tasks (possibly empty)
	 * @throws IllegalStateException if the HTTP call yields no response or no content
	 * @throws Exception if the HTTP request itself fails
	 */
	public static CrawlerExtractInfo crawler(CrawlerTaskInfo crawlerTaskInfo) throws Exception {
		String url = crawlerTaskInfo.getTaskUrl();

		HttpClientUtil httpClientUtil = new HttpClientUtil();
		httpClientUtil.setUrl(url);
		HttpClientResponse responseBean = httpClientUtil.get();
		
		// Fail with a message that names the URL instead of an anonymous failure further down.
		String html = responseBean == null ? null : responseBean.getContent();
		if (html == null) {
			throw new IllegalStateException("No content returned for " + url);
		}
		
		Document doc = Jsoup.parse(html);
		Elements elements = doc.select(LINK_SELECTOR);
		
		List<CrawlerTaskInfo> crawlerTaskInfos = new ArrayList<CrawlerTaskInfo>(elements.size());
		for (Element element : elements) {
			String name = element.text();
			String href = element.attr("href").trim();
			if (href.isEmpty()) {
				// Without a target there is nothing for the next step to fetch.
				log.debug("Skipping link without href: {}", name);
				continue;
			}
			String childUrl = toAbsoluteUrl(href);
			
			JSONObject info = new JSONObject();
			info.put("nameStep1", name);
			info.put("urlStep1", childUrl);
			
			CrawlerTaskInfo crawlerURL = new CrawlerTaskInfo();
			crawlerURL.setChildUrl(childUrl);
			crawlerURL.setInformation(info.toJSONString());
			crawlerURL.setTaskName(NEXT_TASK_NAME);
			crawlerURL.setGroupName(GROUP_NAME);
			
			crawlerTaskInfos.add(crawlerURL);
		}
				
		return new CrawlerExtractInfo(crawlerTaskInfos);
	}
	
	/**
	 * Returns {@code href} unchanged when it already starts with {@code http://}
	 * or {@code https://} (case-insensitive); otherwise prefixes it with
	 * {@link #DEFAULT_SCHEME}, which turns {@code //host/path} into
	 * {@code https://host/path}.
	 */
	private static String toAbsoluteUrl(String href) {
		boolean hasScheme = href.regionMatches(true, 0, "http://", 0, 7)
				|| href.regionMatches(true, 0, "https://", 0, 8);
		return hasScheme ? href : DEFAULT_SCHEME + href;
	}
	
	
	/**
	 * Manual run: crawls https://www.jd.com/allSort.aspx and prints every
	 * follow-up task to standard output.
	 *
	 * @param args unused
	 * @throws Exception if the crawl fails
	 */
	public static void main(String[] args) throws Exception {
		CrawlerTaskInfo crawlerTaskInfo = new CrawlerTaskInfo();
		crawlerTaskInfo.setTaskUrl("https://www.jd.com/allSort.aspx");
		CrawlerExtractInfo crawlerExtractInfo = crawler(crawlerTaskInfo);
		for (CrawlerTaskInfo task : crawlerExtractInfo.getCrawlerTaskInfos()) {
			System.out.println(task);
		}
	}
}
